This notebook is for preparing input files for pyclone-vi. The same files with a minor modification will be used as an input for FastClone.
suppressPackageStartupMessages({
library(tidyverse)
library(readxl)
library(cDriver) # Calculate CCF, https://github.com/hanasusak/cDriver/
})
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tumor-clone-inference")
input_dir <- file.path(analysis_dir, "input")
data_dir <- file.path(root_dir, "data")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")
maf_files_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal", "results")
# Input files
#pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
maf_file <- file.path(maf_files_dir, "tmb_vaf_genomic.tsv")
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv") # file from add-sample-distribution module
nautilus_dec_file <- file.path(input_dir, "deceased_samples.xlsx")
palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")
# EXAMPLE
# But to replace with dir with cns data inferred by CNVkit
# PT_Z4BF2NSB_dir <- file.path(input_dir, "cnvkit_data_example/PT_Z4BF2NSB")
cns_dir <- file.path(input_dir, "cnvkit_data")
# File path to results directory
results_dir <-
file.path(analysis_dir, "results")
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}
# File path to input directory
pyclonevi_input_dir <-
file.path(analysis_dir, "results", "pyclone-vi-input")
if (!dir.exists(pyclonevi_input_dir)) {
dir.create(pyclonevi_input_dir)
}
# File path to input directory
fastclone_input_dir <-
file.path(analysis_dir, "results", "fastclone-input")
if (!dir.exists(pyclonevi_input_dir)) {
dir.create(pyclonevi_input_dir)
}
# File path to plot directory
pyclone_plots_dir <-
file.path(analysis_dir, "plots", "pyclone-vi")
if (!dir.exists(pyclone_plots_dir)) {
dir.create(pyclone_plots_dir)
}
source(paste0(root_dir, "/figures/scripts/theme.R"))
# Read maf
maf_df <- readr::read_tsv(maf_file, guess_max = 100000, show_col_types = FALSE) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id, Chromosome,
Start_Position, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2,
t_ref_count, t_alt_count, tmb,
tumor_fraction, tumor_ploidy, VAF, Hugo_Symbol,
mutation_count) %>%
# Remove hypermutants
filter(!tmb >= 10,
# There are alterations with high number of read counts.
# We will exclude those with >1000 for now.
!t_alt_count >= 1000,
!t_ref_count >= 1000) %>%
# Add `normal_cn`: Total copy number of segment in healthy tissue.
# For autosome this will be two and male sex chromosomes one.
# See: https://github.com/Roth-Lab/pyclone-vi
mutate(normal_cn = case_when(grepl("chrY", Chromosome) ~ "1",
TRUE ~ "2"))
# Calculate CCF
maf_df <- CCF(maf_df)
# Read patient list
genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
select(!c(cancer_group, experimental_strategy)) %>%
left_join(maf_df, by = c("Kids_First_Participant_ID")) %>% # create unique identifiers
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":"),
cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
cg_id_kids = str_replace(cg_id_kids, "/|-", "_"),
cg_id_kids = str_replace_all(cg_id_kids, " ", "_"),
cg_id = str_replace(cg_id, "/|-", "_"),
cg_id = str_replace_all(cg_id, " ", "_"),
sample_id = paste(tumor_descriptor, Kids_First_Biospecimen_ID, sep = ":"))
# Let's count #specimens per sample
# We will remove any samples with less than 2 specimens as phylogenies require at least 3 taxa
kids_specimens_n_df <- genomic_paired_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_specimens_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_specimens_number = n) %>%
filter(!kids_specimens_number <= 2) %>%
left_join(genomic_paired_df, by = c("Kids_First_Participant_ID")) %>%
#left_join(maf_df) %>%
filter(!is.na(Chromosome))
# Let's confirm and add the number of timepoints per sample
# In case we lost any from filtering hypermutants and high reads/alteration
timepoints_n_df <- kids_specimens_n_df %>%
select(Kids_First_Participant_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_timepoints_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_timepoints_number = n) %>%
filter(!kids_timepoints_number <= 1)
df <- timepoints_n_df %>%
left_join(kids_specimens_n_df, by = c("Kids_First_Participant_ID")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny.tsv"))
# List with samples eligible for phylogeny
# I added the information about to use for phylogenetic inferences
list_df <- df %>%
select(Kids_First_Participant_ID) %>%
unique() %>%
mutate(somatic_germline_phylogeny = case_when(grepl("PT_KZ56XHJT|PT_KTRJ8TFY", Kids_First_Participant_ID) ~ "yes",
TRUE ~ "not_yet")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny_list.tsv"))
nautilus_dec_df <- read_excel(nautilus_dec_file) %>%
right_join(df, by = c("sample_id", "aliquot_id", "tumor_descriptor")) %>%
write_tsv(file.path(results_dir, "nautilus_dec.tsv"))
list_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, `Note field from Nautilus of initial parent`) %>%
unique() %>%
write_tsv(file.path(results_dir, "nautilus_dec_list.tsv"))
# Make list with samples with >2 biospecimens at Deceased timepoint
dec_n_df <- df %>%
filter(tumor_descriptor == "Deceased") %>%
select(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
# dplyr::mutate(kids_dec_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_deceased_bs_number = n) %>%
filter(!kids_deceased_bs_number <= 1) %>%
write_tsv(file.path(results_dir, "kids_dec_multiple_bs_list.tsv"))
###-----------------------------------------------------
# let's look into PT_3KM9W8S8
PT_3KM9W8S8_df <- nautilus_dec_df %>%
filter(Kids_First_Participant_ID == "PT_3KM9W8S8",
Kids_First_Biospecimen_ID == "BS_2NQXY528") %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, `Note field from Nautilus of initial parent`) %>%
unique() %>%
write_tsv(file.path(results_dir, "nautilus_dec_list.tsv"))
#bs_id <- print(unique(PT_3KM9W8S8_df$Kids_First_Biospecimen_ID))
We need to generate the input files according to the method’s template. Phylogenetic methods require at least 2 samples per tumor site (multiregional sampling per anatomical site). Here, we will consider kids samples with more than 2 timepoints with one or more biospecimens. We will compare later differences in samples with single vs multiple biospecimens.
# Create pyclone df for all samples
pyclone_all_samples_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id,
cg_id_kids, mutation_id, sample_id,
tumor_descriptor, Chromosome, Start_Position,
Reference_Allele, Tumor_Seq_Allele2, t_ref_count, t_alt_count,
normal_cn, tumor_fraction, mutation_count) %>%
# change names to match input requirements
dplyr::rename("ref_counts" = "t_ref_count",
"alt_counts" = "t_alt_count",
"tumour_content" = "tumor_fraction") %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor,
cg_id_kids, mutation_id, sample_id, ref_counts,
alt_counts, normal_cn, tumour_content, mutation_count)
# I will test one HGG dataset for now
# PT_Z4BF2NSB
#PT_Z4BF2NSB_DF <- pyclone_all_samples_df %>%
# filter(Kids_First_Participant_ID == "PT_Z4BF2NSB")
data_dir <- dir(path = cns_dir, pattern = ".call.cns", full.names = TRUE, recursive = TRUE)
# Create list
data_list <- list()
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
data_list[[i]] <- read.csv(data_dir[i], header=T, sep="\t")
# Create file_name
file_name <- gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,12])
print(file_name)
data_list[[i]] <- data_list[[i]] %>%
mutate(Kids_First_Biospecimen_ID = file_name)
# The following code assigns name to df
# But adds an extra df into the list - dont know yet why but let's remove, it's not necessary
#df <- assign(file_name, data_list)
#data_list[[file_name]] <- df
}
}
[1] "PT_KTRJ8TFY"
[1] "BS_3VKW5988"
[1] "PT_KTRJ8TFY"
[1] "BS_402W79TS"
[1] "PT_KTRJ8TFY"
[1] "BS_5968GBGT"
[1] "PT_KTRJ8TFY"
[1] "BS_AF5D41PD"
[1] "PT_KTRJ8TFY"
[1] "BS_BQ81D2BP"
[1] "PT_KTRJ8TFY"
[1] "BS_EE73VE7V"
[1] "PT_KTRJ8TFY"
[1] "BS_HYKV2TH9"
[1] "PT_KTRJ8TFY"
[1] "BS_NGSG2KB6"
[1] "PT_KZ56XHJT"
[1] "BS_0ATJ22QA"
[1] "PT_KZ56XHJT"
[1] "BS_1Q524P3B"
[1] "PT_KZ56XHJT"
[1] "BS_21ET39G7"
[1] "PT_KZ56XHJT"
[1] "BS_22VCR7DF"
[1] "PT_KZ56XHJT"
[1] "BS_9DN4QR6E"
[1] "PT_KZ56XHJT"
[1] "BS_AK9BV52G"
[1] "PT_KZ56XHJT"
[1] "BS_D6STCMQS"
[1] "PT_KZ56XHJT"
[1] "BS_FCDAH728"
[1] "PT_KZ56XHJT"
[1] "BS_FWP8ZA4K"
[1] "PT_KZ56XHJT"
[1] "BS_H8NWA41N"
[1] "PT_KZ56XHJT"
[1] "BS_X5VN0FW0"
[1] "PT_KZ56XHJT"
[1] "BS_YHXMYDBN"
[1] "PT_Z4BF2NSB"
[1] "BS_2T4EJ6KN"
[1] "PT_Z4BF2NSB"
[1] "BS_537YFJ06"
[1] "PT_Z4BF2NSB"
[1] "BS_C6T8F9K7"
[1] "PT_Z4BF2NSB"
[1] "BS_EJP43CD9"
[1] "PT_Z4BF2NSB"
[1] "BS_GSXNQNRY"
[1] "PT_Z4BF2NSB"
[1] "BS_M29BNE7Z"
[1] "PT_Z4BF2NSB"
[1] "BS_MRN9VDQ0"
[1] "PT_Z4BF2NSB"
[1] "BS_W2QCHQ7E"
# Bind all df from list
data_list_bind <- dplyr::bind_rows(data_list)
# Create and save pyclone_input!
pyclone_input <- data_list_bind %>%
# Rename to match input format
# To figure out if the assignment is correct
dplyr::rename("major_cn" = "cn1",
"minor_cn" = "cn2") %>%
left_join(pyclone_all_samples_df) %>%
filter(!is.na(major_cn),
!is.na(minor_cn)) %>%
dplyr::mutate(mutation_id = paste(mutation_id, major_cn, minor_cn,sep = ":")) %>%
select(Kids_First_Participant_ID, tumor_descriptor, cg_id_kids, mutation_id, sample_id, ref_counts,
alt_counts, normal_cn, major_cn, minor_cn, tumour_content, mutation_count) %>%
# To ensure there are no duplicated entries in the dataframe
distinct()
Joining with `by = join_by(Kids_First_Biospecimen_ID)`Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
# Save input file for pyclone-vi
fname <- paste0(pyclonevi_input_dir, "/", sample_name[x], ".tsv")
print(fname)
pyclone_input_subset <- pyclone_input %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
#select(-c(Kids_First_Participant_ID)) %>%
write_tsv(file.path(fname))
# Save input file for FastClone
fastclone_fname <- paste0(fastclone_input_dir, "/", sample_name[x], ".tsv")
print(fastclone_fname)
fastclone_input_subset <- pyclone_input %>%
dplyr::rename("var_counts" = "alt_counts") %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
#select(-c(Kids_First_Participant_ID)) %>%
write_tsv(file.path(fastclone_fname))
}
}
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KTRJ8TFY"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KTRJ8TFY.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KTRJ8TFY.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_KZ56XHJT"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_KZ56XHJT.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_KZ56XHJT.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
[1] "PT_Z4BF2NSB"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/pyclone-vi-input/PT_Z4BF2NSB.tsv"
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tumor-clone-inference/results/fastclone-input/PT_Z4BF2NSB.tsv"
# To find the position of duplicate elements in x, use this:
# duplicated(pyclone_input)
# Extract duplicate elements:
# pyclone_input[duplicated(pyclone_input)]
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) %>%
mutate(tumor_descriptor = color_names)
# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$tumor_descriptor
# Define timepoints
timepoints = c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
pyclone_input_subset <- pyclone_input %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
select(-c(Kids_First_Participant_ID)) #%>%
# mutate(tumor_descriptor = factor(tumor_descriptor),
# tumor_descriptor = fct_relevel(tumor_descriptor, timepoints)) %>%
#arrange(tumor_descriptor, sample_id)
# Make this reproducible
set.seed(2023)
# Define label for plots
Timepoint <- factor(x = pyclone_input_subset$tumor_descriptor, levels = timepoints)
#######################
# Create bxp ref_counts
p <- print(ggplot(pyclone_input_subset, aes(sample_id, ref_counts, color = Timepoint)) +
geom_jitter(width = 0.15, size = 0.7, alpha = 0.6) +
ggplot2::geom_boxplot(color = "black",
size = 0.25,
alpha = 0,
coef = 0) + # remove whiskers
theme_Publication() +
scale_color_manual(values = palette,
breaks = sort(names(palette))) +
#rotate() +
theme(axis.text.x = element_text(angle = 90)) +
stat_summary(fun.y=mean,shape=1,col='black',geom='point') +
labs(title = sample_name[x],
x = "sample_id",
y = "ref_counts",
color = "Timepoint"))
# Save the plot
ggsave(filename = paste0(sample_name[x], "-ref_counts.pdf"),
path = pyclone_plots_dir,
width = 6,
height = 5,
device = "pdf",
useDingbats = FALSE)
#######################
# Create bxp alt_counts
p <- print(ggplot(pyclone_input_subset, aes(sample_id, alt_counts, color = Timepoint)) +
geom_jitter(width = 0.15, size = 0.7, alpha = 0.6) +
ggplot2::geom_boxplot(color = "black",
size = 0.25,
alpha = 0,
coef = 0) + # remove whiskers
theme_Publication() +
scale_color_manual(values = palette,
breaks = sort(names(palette))) +
#rotate() +
theme(axis.text.x = element_text(angle = 90)) +
stat_summary(fun.y=mean,shape=1,col='black',geom='point') +
labs(title = sample_name[x],
x = "sample_id",
y = "alt_counts",
color = "Timepoint"))
# Save the plot
ggsave(filename = paste0(sample_name[x], "-alt_counts.pdf"),
path = pyclone_plots_dir,
width = 6,
height = 5,
device = "pdf",
useDingbats = FALSE)
}
}
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)
for (x in seq_along(sample_name) ) {
pyclone_input_subset <- pyclone_input %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
select(-c(Kids_First_Participant_ID))
# Define parameters for function
ylim = max(pyclone_input_subset$mutation_count)
# Rename legend for timepoints
Timepoint <- factor(pyclone_input_subset$tumor_descriptor)
# Plot stacked barplot
print(ggplot(pyclone_input_subset, aes(x = sample_id,
y = mutation_count,
fill = Timepoint)) +
geom_col(position = position_stack(reverse = TRUE)) +
geom_bar(stat = "identity", width = 0.5) +
scale_fill_manual(values = palette, breaks = sort(names(palette))) +
theme_Publication() +
theme(axis.text.x = element_text(angle = 85,
hjust = 1,
vjust = 1)) +
labs(title = paste(sample_name)) +
labs(x = "sample_id", y = "Total Mutations") +
ylim(0, ylim))
}
}
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KTRJ8TFY"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_KZ56XHJT"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
[1] "PT_Z4BF2NSB"
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.6.1
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggthemes_4.2.4 cDriver_0.4.2 readxl_1.4.3 fishplot_0.5.2 igraph_1.5.1 packcircles_0.3.6 gridBase_0.4-7 reshape2_1.4.4 gridExtra_2.3
[10] clonevol_0.99.11 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
[19] ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] gtable_0.3.4 xfun_0.41 bslib_0.5.1 tzdb_0.4.0 vctrs_0.6.4 tools_4.3.1 generics_0.1.3 parallel_4.3.1 fansi_1.0.5
[10] pkgconfig_2.0.3 data.table_1.14.8 lifecycle_1.0.4 compiler_4.3.1 farver_2.1.1 textshaping_0.3.7 munsell_0.5.0 htmltools_0.5.7 sass_0.4.7
[19] gmp_0.7-2 yaml_2.3.7 pillar_1.9.0 crayon_1.5.2 jquerylib_0.1.4 cachem_1.0.8 tidyselect_1.2.0 digest_0.6.33 stringi_1.8.1
[28] labeling_0.4.3 rprojroot_2.0.4 fastmap_1.1.1 colorspace_2.1-0 cli_3.6.1 magrittr_2.0.3 utf8_1.2.4 withr_2.5.2 Rmpfr_0.9-3
[37] scales_1.2.1 plotrix_3.8-4 bit64_4.0.5 timechange_0.2.0 rmarkdown_2.25 bit_4.0.5 cellranger_1.1.0 ragg_1.2.6 png_0.1-8
[46] hms_1.1.3 evaluate_0.23 knitr_1.45 rlang_1.1.2 Rcpp_1.0.11 glue_1.6.2 rstudioapi_0.15.0 vroom_1.6.4 jsonlite_1.8.7
[55] R6_2.5.1 plyr_1.8.9 systemfonts_1.0.5